In [1]:
import os 
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

In [2]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install transformers pandas scikit-learn pyarrow accelerate transformers[torch] ipywidgets tqdm 

Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import torch
import logging
import os
import time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, DistilBertTokenizer, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
from datetime import datetime

In [4]:
torch.cuda.is_available()

True

In [5]:
log_filename = f'logs/training_log_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log'
logging.basicConfig(filename=log_filename, filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

In [6]:
def load_category_data(file_path: str, category: str):
    df = pd.read_feather(file_path)
    # add a column for the category
    df['category'] = category
    return df

In [7]:
model_map = {
  'bert-base-uncased': {
    'model': BertForSequenceClassification,
    'tokenizer': BertTokenizer,
  },
  'distilbert-base-uncased': {
    'model': DistilBertForSequenceClassification,
    'tokenizer': DistilBertTokenizer,
  },
  'distilbert-base-uncased-finetuned-sst-2-english': {
    'model': DistilBertForSequenceClassification,
    'tokenizer': DistilBertTokenizer,
  },
  'albert-base-v2': {
    'model': AlbertForSequenceClassification,
    'tokenizer': AlbertTokenizer,
  },
  'roberta-base': {
    'model': RobertaForSequenceClassification,
    'tokenizer': RobertaTokenizer,
  },
}

In [8]:
training_datasets = {
  'crime': ['FA-KES-Dataset'],
  'health': ['covid_FNIR', 'covid_fake_news_dataset'],
  'politics': ['pheme', 'liar_dataset', ],
  'science': ['climate_dataset'],
  'social_media': ['gossipcop']
}

testing_datasets = {
  'crime': ['snope'],
  'health': ['covid_claims'],
  'politics': ['politifact_dataset', 'fake_news_dataset'],
  'science': ['climate_dataset'],
  'social_media': ['isot_dataset']
}

In [9]:
# show info on the categories and datasets
training_data = pd.concat([load_category_data(os.path.join(os.path.realpath('.'), f'..\data\{category}\{dataset}.feather'), category) for category, datasets in training_datasets.items() for dataset in datasets])
testing_data = pd.concat([load_category_data(os.path.join(os.path.realpath('.'), f'..\data\{category}\{dataset}.feather'), category) for category, datasets in testing_datasets.items() for dataset in datasets])
print(f'Training data: {training_data.shape}')
print(f'Testing data: {testing_data.shape}')

Training data: (53818, 6)
Testing data: (90893, 5)


In [10]:
# keep only the text, category, and label columns
training_data = training_data[['text', 'category', 'label']]
training_data.dropna(inplace=True)
testing_data = testing_data[['text', 'category', 'label']]
testing_data.dropna(inplace=True)

In [11]:
# Match the number of samples for each category
df = training_data.groupby('label').apply(lambda x: x.sample(n=1000, random_state=42, replace=True)).groupby('category').apply(lambda x: x.sample(n=600, random_state=42, replace=True))
df = df.reset_index(drop=True)
df.groupby('category').count()

Unnamed: 0_level_0,text,label
category,Unnamed: 1_level_1,Unnamed: 2_level_1
crime,600,600
health,600,600
politics,600,600
science,600,600
social_media,600,600


In [12]:
df.groupby('label').count()

Unnamed: 0_level_0,text,category
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1567,1567
1,1433,1433


In [13]:
df_texts = df['text'].to_list()
df_labels = df['label'].to_list()

In [14]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df_texts, df_labels, test_size=0.2, random_state=7623)

In [15]:
model_name = 'distilbert-base-uncased'
model_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.info('====================================================')
# load the tokenizer and model
tokenizer = model_map[model_name]['tokenizer'].from_pretrained(model_name)
model = model_map[model_name]['model'].from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# create the encodings
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [17]:
# Convert the encodings to PyTorch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Create a DataLoader for training and testing
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=8)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
data_test_dataloader = DataLoader(test_data, batch_size=8)

In [18]:
# Move the model and data to the GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
train_inputs, train_masks, train_labels = train_inputs.to(device), train_masks.to(device), train_labels.to(device)
test_inputs, test_masks, test_labels = test_inputs.to(device), test_masks.to(device), test_labels.to(device)

In [19]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 1

# Fine-tune the pre-trained BERT model
train_start_time = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
    logging.info(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
train_end_time = time.time()

100%|██████████| 300/300 [02:44<00:00,  1.83it/s]

Epoch 1 average train loss: 0.4847223972032468





In [20]:
torch.save(model.state_dict(), f'models/{model_name}-{model_time}.pt')

In [21]:
# Evaluate the model
eval_start_time = time.time()
model.eval()
predictions = []
for batch in data_test_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

In [22]:
# Calculate additional metrics
precision = precision_score(test_labels.cpu(), predictions)
recall = recall_score(test_labels.cpu(), predictions)
f1 = f1_score(test_labels.cpu(), predictions)
accuracy = accuracy_score(test_labels.cpu(), predictions)
g_mean = (recall*accuracy)**0.5

In [23]:
# Log the results
logging.info(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results:")
logging.info(f"Training time: {train_end_time - train_start_time} seconds")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")

# Print the evaluation results
print(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results:")
print(f"Training time: {train_end_time - train_start_time} seconds")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")

distilbert-base-uncased 2023-12-21_15-16-58 with 1 epochs: Evaluation Results:
Training time: 164.36883211135864 seconds
Inference time: 17.819432258605957 seconds
Precision: 0.8493150684931506
Recall: 0.8671328671328671
F-score: 0.8581314878892733
Accuracy: 0.8633333333333333
G-mean: 0.8652310145994008


In [24]:
# Evaluate the model further by testing it with ones that are not in df but are in data
# get the data that is not in df but is in data

data_only = testing_data.groupby('label').apply(lambda x: x.sample(n=1000, random_state=42, replace=True)).groupby('category').apply(lambda x: x.sample(n=160, random_state=42, replace=True))

# Load the data
data_texts = data_only['text'].to_list()
data_labels = data_only['label'].to_list()

data_test_labels = torch.tensor(data_labels).to(device)

# Tokenize the data
data_test_encodings = tokenizer(list(data_texts), truncation=True, padding=True)

# Convert the encodings to PyTorch tensors

data_test_inputs = torch.tensor(data_test_encodings['input_ids'])
data_test_masks = torch.tensor(data_test_encodings['attention_mask'])

# Create a DataLoader for testing

data_test_data = TensorDataset(data_test_inputs, data_test_masks)
data_test_dataloader = DataLoader(data_test_data, batch_size=8)

# Evaluate the model
eval_start_time = time.time()
model.eval()
predictions = []
for batch in data_test_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

# Calculate additional metrics
precision = precision_score(data_test_labels.cpu(), predictions)
recall = recall_score(data_test_labels.cpu(), predictions)
f1 = f1_score(data_test_labels.cpu(), predictions)
accuracy = accuracy_score(data_test_labels.cpu(), predictions)
g_mean = (recall*accuracy)**0.5

# Log the results
logging.info(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results (completely new data):")
logging.info(f"Training time: {train_end_time - train_start_time} seconds")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")

# Print the evaluation results
print(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results (completely new data):")
print(f"Training time: {train_end_time - train_start_time} seconds")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")

distilbert-base-uncased 2023-12-21_15-16-58 with 1 epochs: Evaluation Results (completely new data):
Training time: 164.36883211135864 seconds
Inference time: 24.223811149597168 seconds
Precision: 0.3878504672897196
Recall: 0.4368421052631579
F-score: 0.41089108910891087
Accuracy: 0.405
G-mean: 0.42061984336402747
