In [1]:
import os 
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

In [2]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install transformers pandas scikit-learn pyarrow accelerate transformers[torch] ipywidgets tqdm 

Looking in indexes: https://download.pytorch.org/whl/cu121Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [3]:
import pandas as pd
import torch
import logging
import os
import time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, DistilBertTokenizer, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
from datetime import datetime

In [4]:
torch.cuda.is_available()

True

# Data processing

In [5]:
log_filename = f'logs/training_log_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log'
logging.basicConfig(filename=log_filename, filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

In [6]:
def load_category_data(file_path: str, category: str):
    df = pd.read_feather(file_path)
    # add a column for the category
    df['category'] = category
    return df

In [7]:
model_map = {
  'bert-base-uncased': {
    'model': BertForSequenceClassification,
    'tokenizer': BertTokenizer,
  },
  'distilbert-base-uncased': {
    'model': DistilBertForSequenceClassification,
    'tokenizer': DistilBertTokenizer,
  },
  'distilbert-base-uncased-finetuned-sst-2-english': {
    'model': DistilBertForSequenceClassification,
    'tokenizer': DistilBertTokenizer,
  },
  'albert-base-v2': {
    'model': AlbertForSequenceClassification,
    'tokenizer': AlbertTokenizer,
  },
  'roberta-base': {
    'model': RobertaForSequenceClassification,
    'tokenizer': RobertaTokenizer,
  },
}

In [8]:
categories = {
  'crime': ['FA-KES-Dataset', 'snope'],
  'health': ['covid_claims', 'covid_fake_news_dataset', 'covid_FNIR'],
  'politics': ['politifact_dataset', 'liar_dataset', 'fake_news_dataset', 'pheme'],
  'science': ['climate_dataset'],
  'social_media': ['isot_dataset', 'gossipcop']
}

In [9]:
# show info on the categories and datasets
data = pd.concat([load_category_data(os.path.join(os.path.realpath('.'), f'..\data\{category}\{dataset}.feather'), category) for category, datasets in categories.items() for dataset in datasets])
print(data.groupby('category').count())

               text  label  metadata  title  author
category                                           
crime          1119   1119      1119      0       0
health        13521  13528     13528   5939       0
politics      61173  61212     61212      0    6424
science         907    907       907      0       0
social_media  67038  67038     67038      0       0


In [10]:
# keep only the text and category columns
data = data[['text', 'category']]
data.dropna(inplace=True)
print(data.groupby('category').count())

               text
category           
crime          1119
health        13521
politics      61173
science         907
social_media  67038


In [11]:
data['encoded_category'] = data['category'].astype('category').cat.codes
data.head()


Unnamed: 0,text,category,encoded_category
0,Wed 05 Apr 2017 Syria attack symptoms consiste...,crime,0
1,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,crime,0
2,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,crime,0
3,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,crime,0
4,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,crime,0


In [12]:
# Match the number of samples for each category
df = data.groupby('category').apply(lambda x: x.sample(n=850, random_state=42))
df = df.reset_index(drop=True)
df.groupby('category').count()

Unnamed: 0_level_0,text,encoded_category
category,Unnamed: 1_level_1,Unnamed: 2_level_1
crime,850,850
health,850,850
politics,850,850
science,850,850
social_media,850,850


In [13]:
df_texts = df['text'].to_list()
df_labels = df['encoded_category'].to_list()

# Model training

In [14]:
# Split the dataset into training and testing sets
train_texts, validation_texts, train_labels, validation_labels = train_test_split(df_texts, df_labels, test_size=0.2, random_state=7623)

In [15]:
model_name = 'distilbert-base-uncased'
model_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.info('====================================================')
# load the tokenizer and model
tokenizer = model_map[model_name]['tokenizer'].from_pretrained(model_name)
model = model_map[model_name]['model'].from_pretrained(model_name, num_labels=len(categories))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# create the encodings
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
validation_encodings = tokenizer(list(validation_texts), truncation=True, padding=True)

In [17]:
# Convert the encodings to PyTorch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_encodings['input_ids'])
validation_masks = torch.tensor(validation_encodings['attention_mask'])
validation_labels = torch.tensor(validation_labels)

# Create a DataLoader for training and testing
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=8)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = DataLoader(validation_data, batch_size=8)

In [18]:
# Move the model and data to the GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
train_inputs, train_masks, train_labels = train_inputs.to(device), train_masks.to(device), train_labels.to(device)
validation_inputs, validation_masks, validation_labels = validation_inputs.to(device), validation_masks.to(device), validation_labels.to(device)

In [19]:
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 7

# Fine-tune the pre-trained BERT model
train_start_time = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
    logging.info(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
train_end_time = time.time()

  0%|          | 0/425 [00:00<?, ?it/s]

 28%|██▊       | 119/425 [08:47<22:35,  4.43s/it]


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), f'models/{model_name}-{model_time}.pt')

# Model evaluation

In [None]:
# Evaluate the model
eval_start_time = time.time()
model.eval()
predictions = []
for batch in validation_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

In [None]:
# Calculate additional metrics
precision = precision_score(validation_labels.cpu(), predictions, average='macro')
recall = recall_score(validation_labels.cpu(), predictions, average='macro')
f1 = f1_score(validation_labels.cpu(), predictions, average='macro')
accuracy = accuracy_score(validation_labels.cpu(), predictions)
g_mean = (recall*accuracy)**0.5

In [None]:
# Log the results
logging.info(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results:")
logging.info(f"Training time: {train_end_time - train_start_time} seconds")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")

# Print the evaluation results
print(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results:")
print(f"Training time: {train_end_time - train_start_time} seconds")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")

distilbert-base-uncased 2023-12-14_10-10-31 with 7 epochs: Evaluation Results:
Training time: 1758.0164849758148 seconds
Inference time: 31.92399549484253 seconds
Precision: 0.9687700728774725
Recall: 0.9679539862164452
F-score: 0.9682843239433515
Accuracy: 0.9682352941176471
G-mean: 0.9680946299492776


In [None]:
# Evaluate the model further by testing it with ones that are not in df but are in data
# get the data that is not in df but is in data
test_data = data[~data['text'].isin(df['text'])]

test_data = test_data.groupby('category').apply(lambda x: x.sample(n=160, random_state=42, replace=True))

# Load the data
test_texts = test_data['text'].to_list()
test_labels = test_data['encoded_category'].to_list()

# Tokenize the data
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Convert the encodings to PyTorch tensors

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])

# Create a DataLoader for testing

test_data = TensorDataset(test_inputs, test_masks)
test_dataloader = DataLoader(test_data, batch_size=8)

# Evaluate the model
eval_start_time = time.time()
model.eval()
predictions = []
for batch in test_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

# Calculate additional metrics
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
accuracy = accuracy_score(test_labels, predictions)
g_mean = (recall*accuracy)**0.5

# Log the results
logging.info(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results (completely new data):")
logging.info(f"Training time: {train_end_time - train_start_time} seconds")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")

# Print the evaluation results
print(f"{model_name} {model_time} with {epochs} epochs: Evaluation Results (completely new data):")
print(f"Training time: {train_end_time - train_start_time} seconds")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")

distilbert-base-uncased 2023-12-14_10-10-31 with 7 epochs: Evaluation Results (completely new data):
Training time: 1758.0164849758148 seconds
Inference time: 38.20261216163635 seconds
Precision: 0.9674714467233013
Recall: 0.9678126143521029
F-score: 0.967560059860937
Accuracy: 0.968
G-mean: 0.9679063026413433


: 