In [1]:
%pip install numpy transformers pandas torch scikit-learn pyarrow accelerate transformers[torch] transformers[sentencepiece] ipywidgets tqdm datetime imblearn sentencepiece papermill

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
import logging
import os
import time
import string
import random
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification ,AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.utils.data import DataLoader, TensorDataset
from datetime import datetime

In [3]:
# Default parameters, Papermill will overwrite these
categories = 'science'
select_model = 0
freeze_layers_up_to = 0
weight_for_class_0 = 0.45
weight_for_class_1 = 0.55
learning_rate = 0.00001
min_acc = 0.85

In [4]:
# Parameters
freeze_layers_up_to = 0
weight_for_class_0 = 0.45
weight_for_class_1 = 0.55
learning_rate = 1e-05
min_acc = 0.8
select_model = 5
categories = "crime"


In [5]:
def generate_random_string(length=10):
    letters = string.ascii_letters + string.digits
    return ''.join(random.choice(letters) for _ in range(length))

run_id = generate_random_string()

In [6]:
log_dir_mapping = {
  'crime': './results/crime',
  'science': './results/science',
  'health': './results/health',
  'politics': './results/politics',
  'social_media': './results/social_media'
}

log_dir = log_dir_mapping.get(categories, './results')
os.makedirs(log_dir, exist_ok=True)

log_filename = f'training_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
log_filepath = os.path.join(log_dir, log_filename)

logging.basicConfig(filename=log_filepath, filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
logging.info(f'\nStarting detection model - {run_id}')

In [7]:
root_dir = '../data'

def load_data_from_category(category):
    files = os.listdir(os.path.join(root_dir, category))
    dataframes = []
    for file in files:
        if file.endswith('.feather'):
            df = pd.read_feather(os.path.join(root_dir, category, file))
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

# Use this one when running multiple categories
#combined_dataframes = [load_data_from_category(category) for category in categories] 
# This one is for single category
combined_dataframes = [load_data_from_category(categories)]
combined_df = pd.concat(combined_dataframes, ignore_index=True)
combined_df.dropna(inplace=True)
print(combined_df)

                                                   text  label  \
0     Wed 05 Apr 2017 Syria attack symptoms consiste...      1   
1     Fri 07 Apr 2017 at 0914 Homs governor says U.S...      1   
2     Sun 16 Apr 2017 Death toll from Aleppo bomb at...      1   
3     Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...      1   
4     Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...      1   
...                                                 ...    ...   
1114  Did Seven Key Witnesses to Las Vegas Shooting ...      1   
1115                               Australian Gun Stats      1   
1116                               Australian Gun Stats      1   
1117  Is a Murderous Taxi Driver Killing People in H...      1   
1118  Is a Murderous Taxi Driver Killing People in H...      1   

                                               metadata  
0     {'date': '4/5/2017', 'label': 1, 'location': '...  
1     {'date': '4/7/2017', 'label': 1, 'location': '...  
2     {'date': '4/16/2017', 'labe

In [8]:
'''
def load_data_from_category(category, filenames):
    dataframes = []
    for file_name in filenames:
        file_path = os.path.join(root_dir, category, file_name)
        if os.path.exists(file_path):
            df = pd.read_feather(file_path)
            dataframes.append(df)
        else:
            print(f"The file '{file_name}' in the '{category}' category does not exist.")
    return pd.concat(dataframes, ignore_index=True)

root_dir = '../data'
category = 'health'

# List of filenames to include in combined_df
#included_filenames = ['isot_dataset.feather', 'fake_news_dataset.feather', 'pheme.feather', 'liar_dataset.feather', 'politifact_dataset.feather']
included_filenames = ['covid_claims.feather', 'covid_fake_news_dataset.feather', 'covid_FNIR.feather']
combined_df = load_data_from_category(category, included_filenames)

# Drop NaN values
combined_df.dropna(inplace=True)

# Print the resulting DataFrame
print(combined_df)
'''

'\ndef load_data_from_category(category, filenames):\n    dataframes = []\n    for file_name in filenames:\n        file_path = os.path.join(root_dir, category, file_name)\n        if os.path.exists(file_path):\n            df = pd.read_feather(file_path)\n            dataframes.append(df)\n        else:\n            print(f"The file \'{file_name}\' in the \'{category}\' category does not exist.")\n    return pd.concat(dataframes, ignore_index=True)\n\nroot_dir = \'../data\'\ncategory = \'health\'\n\n# List of filenames to include in combined_df\n#included_filenames = [\'isot_dataset.feather\', \'fake_news_dataset.feather\', \'pheme.feather\', \'liar_dataset.feather\', \'politifact_dataset.feather\']\nincluded_filenames = [\'covid_claims.feather\', \'covid_fake_news_dataset.feather\', \'covid_FNIR.feather\']\ncombined_df = load_data_from_category(category, included_filenames)\n\n# Drop NaN values\ncombined_df.dropna(inplace=True)\n\n# Print the resulting DataFrame\nprint(combined_df)\n

In [9]:
# May need to include more columns to process metadata
texts = combined_df['text'].values
labels = combined_df['label'].values
num_classes = combined_df['label'].nunique()

print(texts[:5])
print(labels[:5])
print(f"Number of unique classes: {num_classes}")

unique_classes, class_counts = np.unique(labels, return_counts=True)

for class_label, count in zip(unique_classes, class_counts):
  print(f"Class {class_label}: {count} instances")


['Wed 05 Apr 2017 Syria attack symptoms consistent with nerve agent use WHO. Victims of a suspected chemical attack in Syria appeared to show symptoms consistent with reaction to a nerve agent the World Health Organization said on Wednesday. "Some cases appear to show additional signs consistent with exposure to organophosphorus chemicals a category of chemicals that includes nerve agents" WHO said in a statement putting the death toll at at least 70. The United States has said the deaths were caused by sarin nerve gas dropped by Syrian aircraft. Russia has said it believes poison gas had leaked from a rebel chemical weapons depot struck by Syrian bombs. Sarin is an organophosporus compound and a nerve agent. Chlorine and mustard gas which are also believed to have been used in the past in Syria are not. A Russian Defence Ministry spokesman did not say what agent was used in the attack but said the rebels had used the same chemical weapons in Aleppo last year. The WHO said it was likel

In [10]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=7623)

In [11]:
# Load the pre-trained BERT (or BERT variation) model and tokenizer
model_mapping = {
    0: (BertTokenizer, BertForSequenceClassification, 'bert-base-uncased'),
    1: (BertTokenizer, BertForSequenceClassification, 'bert-base-cased'),
    2: (DistilBertTokenizer, DistilBertForSequenceClassification, 'distilbert-base-uncased-finetuned-sst-2-english'),
    3: (DistilBertTokenizer, DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    4: (RobertaTokenizer, RobertaForSequenceClassification, 'roberta-base'),
    5: (AlbertTokenizer, AlbertForSequenceClassification, 'albert-base-v2')
}

tokenizer_class, model_class, model_name = model_mapping.get(select_model, (None, None, None))

if tokenizer_class and model_class and model_name:
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name, num_labels=2)
else:
    logging.error(f"Invalid model selection: {select_model}")

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Freeze the layers up to the specified layer
if freeze_layers_up_to > 0:
    for param in model.parameters():
        param.requires_grad = False

    if select_model == 0 or select_model == 1:
        print("Layers: "+str(len(model.bert.encoder.layer)))
        for param in model.bert.embeddings.parameters():
            param.requires_grad = True
    elif select_model == 2 or select_model == 3:
        print("Layers: "+str(len(model.distilbert.transformer.layer)))
        for param in model.distilbert.embeddings.parameters():
            param.requires_grad = True
    elif select_model == 4:
        print("Layers: "+str(len(model.roberta.encoder.layer)))
        for param in model.roberta.embeddings.parameters():
            param.requires_grad = True
    elif select_model == 5:
        print("Layers: "+str(len(model.albert.encoder.albert_layer_groups)))
        for param in model.albert.embeddings.parameters():
            param.requires_grad = True


In [13]:
# Tokenize the texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [14]:
# Convert the encodings to PyTorch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Create a DataLoader for training and testing
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=8)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=8)

In [15]:
# Check for GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [16]:
# Move the model and data to the GPU
model.to(device)
train_inputs, train_masks, train_labels = train_inputs.to(device), train_masks.to(device), train_labels.to(device)
test_inputs, test_masks, test_labels = test_inputs.to(device), test_masks.to(device), test_labels.to(device)

# Define class weights based on class imbalance
class_weights = [weight_for_class_0, weight_for_class_1]

# Define the loss function with class weights
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))

# Define the optimizer with a learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Fine-tune the pre-trained BERT model
train_start_time = time.time()
model.train()
model.to(device)
for batch in train_dataloader:
    optimizer.zero_grad()
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
    outputs = model(**inputs)
    loss = loss_fn(outputs.logits, inputs['labels'])
    loss.backward()
    optimizer.step()
train_end_time = time.time()

In [17]:
# Evaluate the model
eval_start_time = time.time()
model.eval()
predictions = []
for batch in test_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

In [18]:
# Calculate additional metrics
precision = precision_score(test_labels.cpu(), predictions)
recall = recall_score(test_labels.cpu(), predictions)
f1 = f1_score(test_labels.cpu(), predictions)
accuracy = accuracy_score(test_labels.cpu(), predictions)
g_mean = (recall*accuracy)**0.5

In [19]:
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

saved_model_name = f"bert_model_{run_id}.pt"
model_path = os.path.join(save_dir, saved_model_name)

# Only save model if accuracy meets minimum threshold
if accuracy > min_acc:
    torch.save(model.state_dict(), model_path)

In [20]:
# Log the results
logging.info("Evaluation Results")
logging.info(f"Training time: {train_end_time - train_start_time} seconds")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")
logging.info("Additional Info")
logging.info(f"Model name: {model_name}")
logging.info(f"Datasets list: {categories}")
logging.info(f"Layers frozen: {freeze_layers_up_to}")
logging.info(f"Learning rate: {learning_rate}")
logging.info(f"Class weights: {class_weights}")
if accuracy > min_acc:
  logging.info(f"Model saved to: {model_path}")
else:
  logging.info("Model not saved, didn't meet minimum accuracy threshold")

# Print the evaluation results
print("Evaluation Results:")
print(f"Training time: {train_end_time - train_start_time} seconds")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")

Evaluation Results:
Training time: 67.0369017124176 seconds
Inference time: 6.705554723739624 seconds
Precision: 0.5
Recall: 0.3333333333333333
F-score: 0.4
Accuracy: 0.5982142857142857
G-mean: 0.4465476031788346
